{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ba394302",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "# From Expectations to Synthetic Data generation\n",
    "\n",
    "## 3. Synthetic data & expectations\n",
    "\n",
    "After generating the synthetic data, we need to assess its quality. For the purpose of this flow, we are only going to focus on the data fidelity assessment, using both `pandas-profiling` and `great-expectations`.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8c1de75c-0578-416b-beea-2e55665f0559",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "### The dataset - Real and Synthetic data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "81d0987d-1eab-42bd-8bbb-f58df29c277f",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "dataset_name = \"BankChurn\"\n",
    "\n",
    "# Real data and the previously generated synthetic data.\n",
    "real = pd.read_csv('BankChurners.csv')\n",
    "synth = pd.read_csv(f'synth_{dataset_name}', index_col=0)\n",
    "\n",
    "# Read the JSON profiling of the real data.\n",
    "# The `with` block guarantees the file handle is closed after reading.\n",
    "with open(f'.profile_{dataset_name}.json') as f:\n",
    "    json_profile = json.load(f)\n",
    "\n",
    "# The result of json.load() is decoded a second time: json.loads() only accepts\n",
    "# str/bytes, so the file is expected to hold a JSON-encoded string that wraps the profile.\n",
    "json_profile = json.loads(json_profile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "919c534a-cc3b-4ccd-8534-5e0a116c5415",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "   CLIENTNUM     Attrition_Flag  Customer_Age Gender  Dependent_count  \\\n0  768805383  Existing Customer            45      M                3   \n1  818770008  Existing Customer            49      F                5   \n2  713982108  Existing Customer            51      M                3   \n3  769911858  Existing Customer            40      F                4   \n4  709106358  Existing Customer            40      M                3   \n\n  Education_Level Marital_Status Income_Category Card_Category  \\\n0     High School        Married     $60K - $80K          Blue   \n1        Graduate         Single  Less than $40K          Blue   \n2        Graduate        Married    $80K - $120K          Blue   \n3     High School        Unknown  Less than $40K          Blue   \n4      Uneducated        Married     $60K - $80K          Blue   \n\n   Months_on_book  ...  Credit_Limit  Total_Revolving_Bal  Avg_Open_To_Buy  \\\n0              39  ...       12691.0                  777          11914.0   \n1              44  ...        8256.0                  864           7392.0   \n2              36  ...        3418.0                    0           3418.0   \n3              34  ...        3313.0                 2517            796.0   \n4              21  ...        
4716.0                    0           4716.0   \n\n   Total_Amt_Chng_Q4_Q1  Total_Trans_Amt  Total_Trans_Ct  Total_Ct_Chng_Q4_Q1  \\\n0                 1.335             1144              42                1.625   \n1                 1.541             1291              33                3.714   \n2                 2.594             1887              20                2.333   \n3                 1.405             1171              20                2.333   \n4                 2.175              816              28                2.500   \n\n   Avg_Utilization_Ratio  \\\n0                  0.061   \n1                  0.105   \n2                  0.000   \n3                  0.760   \n4                  0.000   \n\n   Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1  \\\n0                                           0.000093                                                                                    \n1                                           0.000057                                                                                    \n2                                           0.000021                                                                                    \n3                                           0.000134                                                                                    \n4                                           0.000022                                                                                    \n\n   Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2  \n0                                            0.99991                                                                                   \n1                                            0.99994                                                                                   \n2                                  
          0.99998                                                                                   \n3                                            0.99987                                                                                   \n4                                            0.99998                                                                                   \n\n[5 rows x 23 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>CLIENTNUM</th>\n      <th>Attrition_Flag</th>\n      <th>Customer_Age</th>\n      <th>Gender</th>\n      <th>Dependent_count</th>\n      <th>Education_Level</th>\n      <th>Marital_Status</th>\n      <th>Income_Category</th>\n      <th>Card_Category</th>\n      <th>Months_on_book</th>\n      <th>...</th>\n      <th>Credit_Limit</th>\n      <th>Total_Revolving_Bal</th>\n      <th>Avg_Open_To_Buy</th>\n      <th>Total_Amt_Chng_Q4_Q1</th>\n      <th>Total_Trans_Amt</th>\n      <th>Total_Trans_Ct</th>\n      <th>Total_Ct_Chng_Q4_Q1</th>\n      <th>Avg_Utilization_Ratio</th>\n      <th>Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1</th>\n      <th>Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>768805383</td>\n      <td>Existing Customer</td>\n      <td>45</td>\n      <td>M</td>\n      <td>3</td>\n      <td>High School</td>\n      <td>Married</td>\n      <td>$60K - $80K</td>\n      <td>Blue</td>\n      <td>39</td>\n      <td>...</td>\n      <td>12691.0</td>\n      <td>777</td>\n      <td>11914.0</td>\n      <td>1.335</td>\n      <td>1144</td>\n      <td>42</td>\n      <td>1.625</td>\n      <td>0.061</td>\n      <td>0.000093</td>\n      <td>0.99991</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>818770008</td>\n      <td>Existing Customer</td>\n      <td>49</td>\n      <td>F</td>\n      <td>5</td>\n      <td>Graduate</td>\n      
<td>Single</td>\n      <td>Less than $40K</td>\n      <td>Blue</td>\n      <td>44</td>\n      <td>...</td>\n      <td>8256.0</td>\n      <td>864</td>\n      <td>7392.0</td>\n      <td>1.541</td>\n      <td>1291</td>\n      <td>33</td>\n      <td>3.714</td>\n      <td>0.105</td>\n      <td>0.000057</td>\n      <td>0.99994</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>713982108</td>\n      <td>Existing Customer</td>\n      <td>51</td>\n      <td>M</td>\n      <td>3</td>\n      <td>Graduate</td>\n      <td>Married</td>\n      <td>$80K - $120K</td>\n      <td>Blue</td>\n      <td>36</td>\n      <td>...</td>\n      <td>3418.0</td>\n      <td>0</td>\n      <td>3418.0</td>\n      <td>2.594</td>\n      <td>1887</td>\n      <td>20</td>\n      <td>2.333</td>\n      <td>0.000</td>\n      <td>0.000021</td>\n      <td>0.99998</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>769911858</td>\n      <td>Existing Customer</td>\n      <td>40</td>\n      <td>F</td>\n      <td>4</td>\n      <td>High School</td>\n      <td>Unknown</td>\n      <td>Less than $40K</td>\n      <td>Blue</td>\n      <td>34</td>\n      <td>...</td>\n      <td>3313.0</td>\n      <td>2517</td>\n      <td>796.0</td>\n      <td>1.405</td>\n      <td>1171</td>\n      <td>20</td>\n      <td>2.333</td>\n      <td>0.760</td>\n      <td>0.000134</td>\n      <td>0.99987</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>709106358</td>\n      <td>Existing Customer</td>\n      <td>40</td>\n      <td>M</td>\n      <td>3</td>\n      <td>Uneducated</td>\n      <td>Married</td>\n      <td>$60K - $80K</td>\n      <td>Blue</td>\n      <td>21</td>\n      <td>...</td>\n      <td>4716.0</td>\n      <td>0</td>\n      <td>4716.0</td>\n      <td>2.175</td>\n      <td>816</td>\n      <td>28</td>\n      <td>2.500</td>\n      <td>0.000</td>\n      <td>0.000022</td>\n      <td>0.99998</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 rows × 23 columns</p>\n</div>"
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "real.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "615aff3a-53e3-4329-90fb-ca01f124f2bc",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "   Attrition_Flag  Customer_Age Gender  Dependent_count Education_Level  \\\n0               1            18      F                0         Unknown   \n1               1             7      F               -4         Unknown   \n2               1            14      F                0         Unknown   \n3               1             0      F               -3         Unknown   \n4               1            15      F               -2         Unknown   \n\n  Marital_Status Income_Category Card_Category  Months_on_book  \\\n0         Single  Less than $40K          Blue               0   \n1         Single  Less than $40K          Gold             -22   \n2         Single  Less than $40K          Blue              -3   \n3         Single  Less than $40K          Gold             -34   \n4         Single  Less than $40K          Blue             -16   \n\n   Total_Relationship_Count  ...  Credit_Limit  Total_Revolving_Bal  \\\n0                         3  ... -19194.160156                  399   \n1                         4  ... -31210.564453                  526   \n2                         3  ... -17750.167969                 -271   \n3                         3  ... -49034.042969                -2627   \n4                         5  ... 
-17883.761719                  107   \n\n   Avg_Open_To_Buy  Total_Amt_Chng_Q4_Q1  Total_Trans_Amt  Total_Trans_Ct  \\\n0    -13395.407227             -0.073560             4348              44   \n1     -6185.100586             -0.890604             8321              47   \n2    -10119.250977             -0.778567             2598              33   \n3    -26602.591797             -0.920144             4179              38   \n4    -10018.253906             -0.185765             5535              38   \n\n   Total_Ct_Chng_Q4_Q1  Avg_Utilization_Ratio  \\\n0            -0.152840              -0.025812   \n1            -2.072164               0.624603   \n2            -0.475148              -0.171177   \n3            -2.743991              -0.244227   \n4            -0.727267               0.367464   \n\n   Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1  \\\n0                                          -0.095610                                                                                    \n1                                          -0.374166                                                                                    \n2                                           0.035958                                                                                    \n3                                          -0.335357                                                                                    \n4                                          -0.063643                                                                                    \n\n   Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2  \n0                                           0.167355                                                                                   \n1                                          -0.027725                            
                                                       \n2                                          -0.112372                                                                                   \n3                                          -0.753540                                                                                   \n4                                          -0.026639                                                                                   \n\n[5 rows x 22 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Attrition_Flag</th>\n      <th>Customer_Age</th>\n      <th>Gender</th>\n      <th>Dependent_count</th>\n      <th>Education_Level</th>\n      <th>Marital_Status</th>\n      <th>Income_Category</th>\n      <th>Card_Category</th>\n      <th>Months_on_book</th>\n      <th>Total_Relationship_Count</th>\n      <th>...</th>\n      <th>Credit_Limit</th>\n      <th>Total_Revolving_Bal</th>\n      <th>Avg_Open_To_Buy</th>\n      <th>Total_Amt_Chng_Q4_Q1</th>\n      <th>Total_Trans_Amt</th>\n      <th>Total_Trans_Ct</th>\n      <th>Total_Ct_Chng_Q4_Q1</th>\n      <th>Avg_Utilization_Ratio</th>\n      <th>Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1</th>\n      <th>Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>18</td>\n      <td>F</td>\n      <td>0</td>\n      <td>Unknown</td>\n      <td>Single</td>\n      <td>Less than $40K</td>\n      <td>Blue</td>\n      <td>0</td>\n      <td>3</td>\n      <td>...</td>\n      <td>-19194.160156</td>\n      <td>399</td>\n      <td>-13395.407227</td>\n      <td>-0.073560</td>\n      <td>4348</td>\n      <td>44</td>\n      <td>-0.152840</td>\n      <td>-0.025812</td>\n      <td>-0.095610</td>\n      <td>0.167355</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>7</td>\n      <td>F</td>\n      <td>-4</td>\n      <td>Unknown</td>\n      <td>Single</td>\n      <td>Less than 
$40K</td>\n      <td>Gold</td>\n      <td>-22</td>\n      <td>4</td>\n      <td>...</td>\n      <td>-31210.564453</td>\n      <td>526</td>\n      <td>-6185.100586</td>\n      <td>-0.890604</td>\n      <td>8321</td>\n      <td>47</td>\n      <td>-2.072164</td>\n      <td>0.624603</td>\n      <td>-0.374166</td>\n      <td>-0.027725</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>1</td>\n      <td>14</td>\n      <td>F</td>\n      <td>0</td>\n      <td>Unknown</td>\n      <td>Single</td>\n      <td>Less than $40K</td>\n      <td>Blue</td>\n      <td>-3</td>\n      <td>3</td>\n      <td>...</td>\n      <td>-17750.167969</td>\n      <td>-271</td>\n      <td>-10119.250977</td>\n      <td>-0.778567</td>\n      <td>2598</td>\n      <td>33</td>\n      <td>-0.475148</td>\n      <td>-0.171177</td>\n      <td>0.035958</td>\n      <td>-0.112372</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>1</td>\n      <td>0</td>\n      <td>F</td>\n      <td>-3</td>\n      <td>Unknown</td>\n      <td>Single</td>\n      <td>Less than $40K</td>\n      <td>Gold</td>\n      <td>-34</td>\n      <td>3</td>\n      <td>...</td>\n      <td>-49034.042969</td>\n      <td>-2627</td>\n      <td>-26602.591797</td>\n      <td>-0.920144</td>\n      <td>4179</td>\n      <td>38</td>\n      <td>-2.743991</td>\n      <td>-0.244227</td>\n      <td>-0.335357</td>\n      <td>-0.753540</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>1</td>\n      <td>15</td>\n      <td>F</td>\n      <td>-2</td>\n      <td>Unknown</td>\n      <td>Single</td>\n      <td>Less than $40K</td>\n      <td>Blue</td>\n      <td>-16</td>\n      <td>5</td>\n      <td>...</td>\n      <td>-17883.761719</td>\n      <td>107</td>\n      <td>-10018.253906</td>\n      <td>-0.185765</td>\n      <td>5535</td>\n      <td>38</td>\n      <td>-0.727267</td>\n      <td>0.367464</td>\n      <td>-0.063643</td>\n      <td>-0.026639</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 rows × 22 columns</p>\n</div>"
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "synth.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aeb7da22-86b6-4875-a285-f084c56da6d6",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "#### Profiling the synthetic data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "bc823b92-9103-4232-8eee-aaf49f652d0c",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "from pandas_profiling import ProfileReport"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "72c16eaa-bc57-43d0-b4b2-95f6c3180a04",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Build the profiling report object for the synthetic data.\n",
    "title = f\"Synth: {dataset_name}\"\n",
    "synth_profile = ProfileReport(\n",
    "    synth,\n",
    "    title=title,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "9e3de6f2-f16c-442e-babf-d48923898104",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": "Summarize dataset:   0%|          | 0/35 [00:00<?, ?it/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "a45fa1fcbb224c5381fdd484990cd303"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": "Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "953c48dea80d44a2aa8ec9d8102a21d0"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": "Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "73d6d49b8211449891428429df007fc5"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": "Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]",
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "e6864f84c1c6418d9cd652f3579f819f"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "synth_profile.to_file('synth_profile.html')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6460dc2a-9dc1-49ad-bc96-cd7bce7b7358",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "#### Running the expectations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "ce5fa722-8614-4d8a-9f3d-bdec244322ea",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import great_expectations as ge\n",
    "\n",
    "# Open the Data Context rooted at the local \"great_expectations\" directory.\n",
    "data_context = ge.data_context.DataContext(\n",
    "    context_root_dir=\"great_expectations\"\n",
    ")\n",
    "\n",
    "# Load the previously built expectation suite for this dataset.\n",
    "suite = data_context.get_expectation_suite(\n",
    "    f\"{dataset_name}_expectations\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ecdb1a86-88ca-461d-92c6-5311befec388",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Wrap the synthetic data in a Great Expectations dataset bound to the loaded suite.\n",
    "batch = ge.dataset.PandasDataset(\n",
    "    synth,\n",
    "    expectation_suite=suite,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2e45447d-9d36-4630-b217-b79bfb08b4e3",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Validate the synthetic batch with the \"action_list_operator\" validation operator.\n",
    "results = data_context.run_validation_operator(\n",
    "    \"action_list_operator\",\n",
    "    assets_to_validate=[batch],\n",
    ")\n",
    "\n",
    "# A single asset was validated, so keep the first result identifier.\n",
    "validation_result_identifier = results.list_validation_result_identifiers()[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1be6e0fa-0f5e-4fa0-97c0-278d2a5f991f",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Build the Data Docs.\n",
    "data_context.build_data_docs()\n",
    "\n",
    "# Open the Data Docs for the validation result obtained above.\n",
    "data_context.open_data_docs(validation_result_identifier)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "profiling",
   "language": "python",
   "name": "profiling"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}